# SAMPLE NAME
## specify sample name
sample.name <- c("beau", "ophio_cflo")
## specify names for all ocflo samples
sampleName <- c("ophio_cflo","ophio_ophio-infected")
## color scheme for the samples
col.scheme <- c("#5A829F", "#AD212F", "#5C2849")
# col.scheme <- c("#5A829F", "#AD212F", "black", "#5C2849")

## annotation files
annots <- list(beau_annots,ophio_cflo_annots)

# SCRIPT NAME
## specify the name of the script (folder) where figures will be saved
script.name <- "01_comparing_gene_exp_ophio_beau"

# eJTK OUTPUT
## Set GammaP threshold below which genes are classified as rhythmic
gamma.pval = 0.05
## Set false discovery rate for functional enrichment analyses
FDR = 5

## Set a counter for supplementary excel files
counter <- 0

## List of metabolic processes (only top level metabolic process GO terms)
# source: http://www.informatics.jax.org/vocab/gene_ontology/GO:0008150

metabolism.gos <- c(metabolic_process = c("GO:0008152", "GO:0044236", "GO:0044710"),
                 adaptive_thermogenesis = c("GO:1990845"),
                 antibiotic_metabolic_process = "GO:0016999",
                 ATP_metabolic_process = "GO:0046034",
                 biosynthetic_process = c("GO:0009058", "GO:0044274", "GO:0044711"),
                 catabolic_process = c("GO:0009056", "GO:0044243", "GO:0044712"),
                 cellular_metabolic_process = "GO:0044237",
                 collagen_biosynthesis = "GO:0032964",
                 collagen_catabolic_process = "GO:0030574",
                 hormone_biosynthetic_process = "GO:0042446",
                 hormone_catabolic_process = "GO:0042447",
                 negative_regulation_of_hormone_metabolism = "GO:0032351",
                 positive_regulation_of_hormone_metabolism = "GO:0032352",
                 peptide_hormone_processing = "GO:0016486",
                 regulation_of_hormone_metabolism = "GO:0032350",
                 insulin_processing = "GO:0030070",
                 pheromone_metabolism = "GO:0042810",
                 carbohydrate_utilization = "GO:0009758")

locomotion.gos <- c(cell_motility = "GO:0048870",
                    directional_locomotion = "GO:0033058",
                    flight = "GO:0060361",
                    locomotory_behavior = c("GO:0031987", "GO:0007626"),
                    regulation_of_locomotion = "GO:0040012",
                    multicellular_organismal_locomotion = "GO:0071965",
                    negative_regulation_of_locomotion = "GO:0040013",
                    positive_regulation_of_locomotion = "GO:0040017",
                    taxis = "GO:0042330"
                    )

immune.gos <- c(immune_system_process = "GO:0002376",
                activation_of_immune_response = "GO:0002253",
                negative_regulation_of_immune_system = "GO:0002683",
                positive_regulation_of_immune_system = "GO:0002684",
                regulation_of_immune_system = "GO:0002682",
                immune_effector_process = "GO:0002252",   
                # Any process of the immune system that executes a component of an immune response. An effector immune process takes place after its activation.
                immune_response = "GO:0006955",
                immunological_memory_process = "GO:0090713",
                tolerance_induction = "GO:0002507",
                detoxification = "GO:0098754",
                toxin_catabolic_process = "GO:0009407",
                xenobiotic_detoxification = c("GO:0140330", "GO:1990961"),
                nitrogen_utilization = "GO:0019740")

interspecies.gos <- c(interspecies_interaction = c("GO:0044419", "GO:0051703"),
                      biological_regulation = "GO:0065007",
                      regulation_of_biological_quality = "GO:0065008",
                      signaling = c("GO:0023052", "GO:0023046", "GO:0044700"),
                      viral_process = c("GO:0016032", "GO:0022415"))

behavior.gos <- c(behavior = c("GO:0007610", "GO:0044708"),
                  rhythmic_behavior = "GO:0007622",
                  social_behavior = "GO:0035176",
                  thermosensory_behavior = "GO:0040040",
                  visual_behavior = "GO:0007632",
                  aggressive_behavior = "GO:0002118",
                  behavioral_defense_response = "GO:0002209",
                  behavioral_response_to_nutrient = "GO:0051780",
                  response_to_water_deprivation = "GO:0042630",
                  chemosensory_behavior = "GO:0007635",
                  olfactory_behavior = "GO:0042048",
                  exploration_behavior = "GO:0035640",
                  feeding_behavior = c("GO:0007631", "GO:0044366", "GO:0044367", "GO:0044368",
                                       "GO:0044369", "GO:0044370", "GO:0044371", "GO:0044372"),
                  foraging_behavior = c("GO:0060756"),
                  grooming_behavior = c("GO:0007625"),
                  host_seeking_behavior = "GO:0032537",
                  learning_memory = "GO:0007611",
                  mechanosensory_behavior = "GO:0007638",
                  response_to_stimulus = c("GO:0050896", "GO:0051869"),
                  motor_behavior = "GO:0061744",
                  negative_regulation_of_behavior = "GO:0048521",
                  positive_regulation_of_behavior = "GO:0048520",
                  regulation_of_behavior = "GO:0050795",
                  reproductive_behavior = "GO:0019098",
                  reproduction = c("GO:0000003", "GO:0019952", "GO:0050876"),
                  rhythmic_process = "GO:0048511")

gut.microbiota.gos <- c(sulfur_metabolism = "GO:0006790",
                        sulfur_amino_acid_metabolism = "GO:0000096",
                        regulation_of_sulfur_metabolism = "GO:0042762",
                        negative_regulation_of_sulfur_metabolism = "GO:0051175",
                        positive_regulation_of_sulfur_metabolism = "GO:0051176")


list.of.gos <- list("metabolism_gos" = metabolism.gos,
                    "locomotion_gos" = locomotion.gos,
                    "immune_gos" = immune.gos,
                    "interspecies_gos" = interspecies.gos,
                    "behavior_gos" = behavior.gos)

Loading databases

# LOAD DATABASES (TC7)
# 1. TC6_ejtk.db
# Desc: This database contains all ejtk-output for TC6
ejtk.db <- dbConnect(RSQLite::SQLite(),
                     paste0(path_to_repo,"/data/databases/TC6_fungal_ejtk.db"))
# which tables are in the database
src_dbi(ejtk.db)
## src:  sqlite 3.29.0 [/Users/biplabendudas/Documents/GitHub/Das_et_al_2022a/data/databases/TC6_fungal_ejtk.db]
## tbls: beau_rhythmic_genes_12h, beau_rhythmic_genes_24h, beau_zscores_08h,
##   beau_zscores_12h, beau_zscores_24h, ophio_cflo_rhythmic_genes_12h,
##   ophio_cflo_rhythmic_genes_24h, ophio_cflo_zscores_08h,
##   ophio_cflo_zscores_12h, ophio_cflo_zscores_24h, ophio_kim_DD_zscores_24h,
##   ophio_kim_LD_rhythmic_genes_24h, ophio_kim_LD_zscores_24h
#
# 2. TC6_data.db
data.db <- dbConnect(RSQLite::SQLite(),
                     paste0(path_to_repo,"/data/databases/TC6_fungal_data.db"))
src_dbi(data.db)
## src:  sqlite 3.29.0 [/Users/biplabendudas/Documents/GitHub/Das_et_al_2022a/data/databases/TC6_fungal_data.db]
## tbls: beau_expressed_genes, beau_fpkm, beau_log2fpkm, beau_zscores,
##   ophio_cflo_expressed_genes, ophio_cflo_fpkm, ophio_cflo_log2fpkm,
##   ophio_cflo_zscores, ophio_kim_DD_expressed_genes, ophio_kim_DD_fpkm,
##   ophio_kim_DD_log2fpkm, ophio_kim_DD_zscores, ophio_kim_expressed_genes,
##   ophio_kim_fpkm, ophio_kim_log2fpkm, ophio_kim_zscores
#
# 3. TC7_data.db
inf.db <- dbConnect(RSQLite::SQLite(),
                   paste0(path_to_repo,"/../Das_et_al_2022b/data/databases/TC7_data.db"))
#
# 4. TC7_ejtk.db
inf.ejtk.db <- dbConnect(RSQLite::SQLite(),
                   paste0(path_to_repo,"/../Das_et_al_2022b/data/databases/TC7_ejtk.db"))
#
#

Overview/Goals

1. General patterns of gene expression

# number of all genes
all.genes <- list()
for (i in 1:length(sample.name)) {
  all.genes[[i]] <- tbl(data.db, paste0(sample.name[[i]] ,"_fpkm")) %>%  
    collect()
  
  writeLines(paste("Number of genes in", sample.name[[i]], ":", nrow(all.genes[[i]])))
}
## Number of genes in beau : 10364
## Number of genes in ophio_cflo : 7455
# A1: genes that have NO expression (FPKM == 0 at all time points)
not.expressed <- list()
for (i in 1:length(sample.name)) {
  not.expressed[[i]] <-
    tbl(data.db, paste0(sample.name[[i]] ,"_fpkm")) %>% 
    collect() %>% 
    filter_at(vars(starts_with("Z")), all_vars(. == 0)) %>%
    pull(gene_name)
  
  # How many genes are not expressed?
  writeLines(paste("n(genes-NOT-EXPRESSED) in", sample.name[[i]], ":", length(not.expressed[[i]])))
  
}
## n(genes-NOT-EXPRESSED) in beau : 759
## n(genes-NOT-EXPRESSED) in ophio_cflo : 190
# A2: run enrichment (make plot of enrichment found of non-expressed genes)

### Gene set of interest 1 ###
gsoi.1 <- list()
# names(gsoi.1) <- "not_expressed_genes"

## loop STARTS
for (i in 1:length(sample.name)) {
  writeLines(paste("running GO enrichment for NOT-EXPRESSED genes in", sample.name[[i]]))
  
  # run enrichment
  not.expressed[[i]] %>% 
    
    check_enrichment(.,
                      org = sample.name[[i]],
                      what = "pfams",
                      bg = 'all',
                      expand = T) %>% 
    
    # which enterotoxin genes are not expressed in culture
    # filter(annot_desc=="Enterotoxin_a") %>% 
    
    # # pull gene names for a given GO term
    # separate_rows(., gene_name, sep = ", ") %>%
    # filter(GO == "GO:0009405") %>% # pathogenesis
    # # filter(GO == "GO:0090729") %>% # toxin activity
    # # filter(GO == "GO:0044419") %>% # interspecies interaction between organisms
    # # filter(GO == "GO:0020037") %>% # heme binding
    # pull()
    
    # nothing much to talk about things that have no annotations ---
    filter(annot_desc!="no_desc") %>%
    
    # save the results to a file
    arrange(annot_desc) ->  gsoi.1[[i]]
  
  # MAKE THE SUPP FILE
  foo <-
    annots[[i]] %>%
    select(gene_name, gene_desc, GOs, pfams, signalP, SSP, TMHMM) %>% 
    filter(gene_name %in% not.expressed[[i]]) %>% 
    left_join(gsoi.1[[i]][,c(1,3)], by="gene_name") %>% 
    select(gene_name, gene_desc, enriched_annot=annot_desc, everything()) %>%
    arrange(enriched_annot)
  gsoi.1[[i]] <- foo
  
    
    # print()
  
}
## running GO enrichment for NOT-EXPRESSED genes in beau
## [1] "Loading annotation file for Beauveria bassiana"
## [1] "Done."
## [1] "Testing for enrichment..."

## running GO enrichment for NOT-EXPRESSED genes in ophio_cflo
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

### loop ENDS
# ## Save results of not_expressed genes into an excel file
# supp <- list(
#   "not_expressed_beau" = gsoi.1[[1]],
#   "not_expressed_ophio" = gsoi.1[[2]])
# writexl::write_xlsx(supp,
#                     path = paste0(path_to_repo,"/results/00_supplementary_files/01_gsoi_not_expressed_beau_ophio.xlsx"
#                                   )
#                     )
# # update counter
# counter <- counter+1

### QUERY ###
#
### Enterotoxin genes that are not expressed in cultures; are they expressed, even lowly, during infection?
#
## ocflo genes are expressed during infection (fpkm ≥ 1 for at least one time point)
expressed.ophio.inf <-
  tbl(inf.db, paste0(sampleName[[2]],"_expressed_genes")) %>% 
    filter(expressed=="yes") %>% 
    collect() %>% 
    pull(gene_name)
## ocflo genes not expressed during infection (fpkm = 0)
not.expressed.ophio.inf <-
  tbl(inf.db, paste0(sampleName[[2]] ,"_fpkm")) %>% 
    collect() %>% 
    filter_at(vars(starts_with("Z")), all_vars(. == 0)) %>%
    pull(gene_name)
## check intersection (are some expressed?)
g <- setdiff((gsoi.1[[2]] %>% filter(enriched_annot=="Enterotoxin_a") %>% pull(gene_name)), not.expressed.ophio.inf)
# intersect(g, expressed.ophio.inf) # all three are lowly expressed during infection
#
### result:
# two of the five show no expression during infeccion
# three others show only low expression (0 < fpkm < 1)
### query ends


### QUERY ###
#
### p450 genes that are not expressed in cultures; are they expressed, even lowly, during infection?
#
## check intersection (are some expressed?)
g <- setdiff((gsoi.1[[2]] %>% filter(enriched_annot=="p450") %>% pull(gene_name)), not.expressed.ophio.inf)
# intersect(g, expressed.ophio.inf) # all three are lowly expressed during infection
#
### result:
# five of the seven show no expression during infeccion
# two others show only low expression (0 < fpkm < 1)
### query ends


# B: genes that are expressed (FPKM > 1 for at least one time point)
expressed <- list()
for (i in 1:length(sample.name)) {
  expressed[[i]] <- 
    tbl(data.db, paste0(sample.name[[i]],"_expressed_genes")) %>% 
    filter(expressed=="yes") %>% 
    collect() %>% 
    pull(gene_name) 
  
  # How many genes are expressed?
  writeLines(paste("n(EXPRESSED) in", sample.name[[i]], ":", length(expressed[[i]])))
}
## n(EXPRESSED) in beau : 9006
## n(EXPRESSED) in ophio_cflo : 6998
expressed.okim <-
  tbl(data.db, paste0("ophio_kim","_expressed_genes")) %>% 
    filter(expressed=="yes") %>% 
    collect() %>% 
    pull(gene_name)

2. Daily rhythms in gene expression

## Load all the rhythmic genesets 
## Note, ordered according to their p-value; highly rhythmic at the top.
#
# Choose period
period = '24'

## 
rhy <- list()
for (i in 1:2) {
  rhy[[i]] <-
    tbl(ejtk.db, paste0(sample.name[[i]],"_zscores_",period,'h')) %>%
    filter(GammaP < gamma.pval) %>%
    select(ID, GammaP) %>% collect() %>% arrange(GammaP) %>%
    select(ID) %>% pull()
  
  # How many genes are rythmic?
  writeLines(paste0("n(rhythmic-",period, "h) in ", sample.name[[i]], " : ", length(rhy[[i]])))
}
## n(rhythmic-24h) in beau : 1872
## n(rhythmic-24h) in ophio_cflo : 2285
rhy24.okim <-
  tbl(ejtk.db, paste0("ophio_kim_LD","_zscores_",period,'h')) %>%
    filter(GammaP < gamma.pval) %>% 
    select(ID, GammaP) %>% collect() %>% arrange(GammaP) %>% 
    select(ID) %>% pull()

Hierarchical clustering of rhy24

  • perform hierarchical clustering of 24h-rhythmic genes into four clusters;
  • plot time-course heatmaps for the clustered 24h-rhythmic geneset
  • Identify the day-peaking and night-peaking clusters visually.
## initialise lists to hold input and output of the hierarchical clustering
zscore.dat <- list() # zscore data (input)
my_gene_col <- list() # cluster identity for each rhythmic gene (output)
rhy.heat <- list() # pheatmap that can be saved/plotted (output)

# specify number of clusters
n_clusters <- 4

## run clustering and plot
for (i in 1:2) {
  ## load zscore dataset
  zscore.dat[[i]] <- data.db %>% tbl(., paste0(sample.name[[i]],"_zscores")) %>% collect()
  
  # Filter the zscores to keep only rhythmic genes
  zscore.rhy <-
    zscore.dat[[i]] %>% 
    filter(gene_name %in% rhy[[i]]) %>% 
    as.data.frame()
  
  # Set genes as rownames and convert it into a matrix
  rownames(zscore.rhy) = zscore.rhy$gene_name
  zscore.rhy <- as.matrix(zscore.rhy[-1])
  
  
  # Hierarchical clustering of the genesets
  my_hclust_gene <- hclust(dist(zscore.rhy), method = "complete")
  
  
  # Make annotations for the heatmaps
  my_clusters <- cutree(tree = as.dendrogram(my_hclust_gene), k = n_clusters) # k=  clusters
  my_gene_col[[i]] <- data.frame(cluster = my_clusters)
  
  
  # I’ll add some column annotations and create the heatmap.
  # Annotations for:
  # 1. Is the sample collected during the light or dark phase? 
  my_sample_col <- data.frame(phase = rep(c("light", "dark", "light"), c(5,6,1)))
  row.names(my_sample_col) <- colnames(zscore.rhy)
  
  # Manual color palette
  my_colour = list(
    phase = c(light = "#F2E18D", dark = "#010440"),
    cluster = viridis::cividis(100)[c(seq(10,80,by=round(80/(n_clusters), 0)))])
  
  # Color scale
  my.breaks = seq(min(zscore.rhy), max(zscore.rhy), by=0.1)
  # my.breaks = seq(min(zscore.rhy), max(zscore.rhy), by=0.06)
  
  # Let's plot!
  rhy.heat[[i]] <-
    pheatmap(zscore.rhy, show_rownames = F, show_colnames = F,
             annotation_row = my_gene_col[[i]], 
             annotation_col = my_sample_col,
             cutree_rows = n_clusters, # OG was 4
             # cutree_cols = 2,
             annotation_colors = my_colour,
             border_color=FALSE,
             cluster_cols = F,
             breaks = my.breaks,
             ## color scheme borrowed from: 
             color = inferno(length(my.breaks) - 1),
             treeheight_row = 0,
             # treeheight_col = 0,
             # remove the color scale or not
             main = paste0(sample.name[[i]], " 24h-rhythmic \n (n=", nrow(zscore.rhy), " genes)"),
             ## annotation legend
             annotation_legend = T,
             ## Color scale
             legend = T)
  
}

Phase plots

rhy.24.sig <- list()
phase.ejtk <- list()

# Obtain the phases of 24h-rhythmic genes beau v. ophio_cflo
for (i in 1:3) {

  if (i != 3) {
  rhy.24.sig[[i]] <- 
    tbl(ejtk.db, paste0(sample.name[i],"_zscores_24h")) %>% 
    filter(GammaP < gamma.pval) %>% 
    collect()

  } else {
    rhy.24.sig[[i]] <- 
      tbl(inf.ejtk.db, paste0(sampleName[2], "_zscores_24h")) %>% 
      filter(GammaP < gamma.pval) %>% 
      collect()
  }
  
  # Get the phases of the best matched waveforms
  phase.ejtk[[i]] <- circular::circular(rhy.24.sig[[i]]$Phase, 
                                        units="hours", template="clock24")

# # Get the time-of-day of expression peak
# phase.ejtk[[i]] <- circular::circular(rhy.24.sig[[i]]$MaxLoc, units="hours", template="clock24")
# # Get the time-of-day of expression trough
# phase.ejtk[[i]] <- circular::circular(rhy.24.sig[[i]]$MinLoc, units="hours", template="clock24")

}

# ## Save the rhy.24.sig file for making a gene annotator function (TC6_annotator)
# save(rhy.24.sig, file = paste0(path_to_repo, "/results/ejtk_output/rhy24_sig_all.Rdata"))


#### Phase Plot ####

# save all the circular phases in a list
l.phases <- phase.ejtk
# let's name the list elements for later use and reference
names(l.phases) <- c(sample.name[1:2],sampleName[2]) # includes ocflo during infection
# names(l.phases) <- c(sample.name[1:2]) # does not have ocflo infection data

# writeLines("Performing Watson test to check if the average peak of 24h-rhythms in Beau and Ophio-cflo differs significantly")
# # For all rhy genes
# beau.ophio <- watson.two.test(l.phases[[1]],l.phases[[2]], alpha = FDR/100)
# writeLines("Beau v. Ophio-cflo")
# beau.ophio %>% print()

## Plot the phase distributions

# Initialize a list for saving the ggplots
g <- list()

means <- as.numeric(lapply(phase.ejtk, mean))
means <- circular(means, units="hours", template="clock24")


for(i in 1:length(l.phases)) {
  
  # define phase levels
  ordered_phases <- c("2","4","6","8","10","12",
                      "14","16","18","20","22","24")
    
  df.test <- l.phases[[i]] %>%
    as.data.frame() %>% 
    mutate(phase = x) %>%
    mutate(phase = replace(phase, x=="0", "24")) %>% 
    select(-x) %>% 
    group_by(phase = factor(phase, levels = ordered_phases)) %>%
    summarise(n_genes = n())

  m <- as.numeric(means[i])
    
  g[[i]] <-
    ggplot(df.test, aes(x=factor(phase), y=n_genes)) + 
    # indicate light-dark phase
    geom_rect(aes(xmin = as.factor(12), xmax = as.factor(24), ymin = -Inf, ymax = Inf),
              fill = "grey80", alpha = 0.05, color=NA) +
    geom_bar(stat='identity', fill=col.scheme[[i]]) +
    
    xlab(c(names(l.phases)[i])) +
    
    scale_y_continuous(n.breaks = 3) +
    # scale_y_continuous(breaks = c(0,300,600), limits = c(0,700)) +
    
    coord_polar() +
    theme_Publication(base_size = 20) +
    theme(# axis.title.x=element_blank(),
              # axis.text.x=element_blank(),
              legend.position = "none")
    #ggtitle(paste0("Dataset: ", names(l.phases)[i])) 
  
}

ggpubr::ggarrange(plotlist=g,
                  nrow = 1, ncol = 2,
                  widths = c(1,1), labels = NA)
## $`1`

## 
## $`2`

## 
## attr(,"class")
## [1] "list"      "ggarrange"

Clusters of rhythmic genes

  • Which processes are identified clusters of rhythmic genes involved in?
  • How do they fluctuate throughout the day?

Beau - 24h

Not going to discuss this in primary text; only as a supplementary file

# ### Gene set of interest 2 ###
# gsoi.2 <- list()
# 
# for (i in 1:n_clusters){
#   
#   writeLines(paste0("Species: ", sample.name[[1]], "\n", "24h-rhythmic genes, Cluster: ", i))
#   
#   # Summary
#   genes <- my_gene_col[[1]] %>% rownames_to_column("g") %>% filter(cluster==as.character(i)) %>% pull(g)
#   writeLines(paste0("n(genes) = ", length(genes),"\n"))
#   
#   # Enrichment
#   gsoi.2[[i]] <-
#     genes %>% 
#       check_enrichment(.,
#                     what = "pfams",
#                     org = sample.name[[1]],
#                     bg = expressed[[1]],
#                     filter = T,
#                     plot = T,
#                     expand = T) %>% 
#     
#     # nothing much to talk about things that have no annotations ---
#     filter(annot_desc!="no_desc") %>%
#     
#     # save the results to a file
#     arrange(annot_desc)
#     
#   # writeLines(paste0("\n", "n(overrepresented terms) = ", nrow(gsoi.2[[i]]), "\n"))
#   # print(overrepresented.terms[[i]] %>% as_tibble())
# 
#   # Stacked zplot
#   genes %>% 
#   stacked.zplot_tc6(cond = "beau", plot.mean = F) %>% 
#     multi.plot(rows = 1, cols = 1)
#   
#   # MAKE THE SUPP FILE
#   gsoi.2[[i]] <-
#     annots[[1]] %>%
#     select(gene_name, gene_desc, GOs, pfams, signalP, SSP, TMHMM) %>% 
#     filter(gene_name %in% genes) %>%  # change here
#     left_join(gsoi.2[[i]][,c(1,3)], by="gene_name") %>%  # change here
#     select(gene_name, gene_desc, enriched_annot=annot_desc, everything()) %>%
#     # mutate(enriched_annot=as.factor(enriched_annot)) %>% 
#     arrange(enriched_annot)
# }
# 
# ## Save results of not_expressed genes into an excel file
# supp <- list(
#   "rhy24_beau_cluster1" = gsoi.2[[1]],
#   "rhy24_beau_cluster2" = gsoi.2[[2]],
#   "rhy24_beau_cluster3" = gsoi.2[[3]],
#   "rhy24_beau_cluster4" = gsoi.2[[4]]
#   )
# writexl::write_xlsx(supp,
#                     path = paste0(path_to_repo,"/results/00_supplementary_files/02_gsoi_rhy24_beau.xlsx"
#                                   ))

# # update counter
# counter <- counter+1

Ophio-cflo - 24h

GOAL: Which cluster(s) contains the rhy24-genes overlapping between Ocflo and Okim?

### Gene set of interest 3 ###
gsoi.3 <- list()
# names(gsoi.3) <- "rhy24_genes_in_ocflo"

## initialise list to hold phase data for rhythmic genes
l.phases <- list()

for (i in 1:n_clusters){
  
  writeLines(paste0("Species: ", sample.name[[2]], "\n", "24h-rhythmic genes, Cluster: ", i))
  
  # Summary
  genes <- my_gene_col[[2]] %>% rownames_to_column("g") %>% filter(cluster==as.character(i)) %>% pull(g)
  writeLines(paste0("n(genes) = ", length(genes),"\n"))
  
  # Enrichment
  if(i==3) {
    gsoi.3[[i]] <-
    genes %>% 
      check_enrichment(.,
                    function.dir = path_to_repo,
                    # what = "pfams",
                    what = "GOs",
                    org = sample.name[[2]],
                    bg = expressed[[2]],
                    filter = T,
                    plot = F,
                    # plot = T,
                    expand = T) %>% 
    
    # nothing much to talk about things that have no annotations ---
    filter(annot_desc!="no_desc") %>%
    # save the results to a file
    arrange(annot_desc) %>% 
    as.data.frame()
  } else {
    gsoi.3[[i]] <-
    genes %>% 
      check_enrichment(.,
                    function.dir = path_to_repo,
                    what = "pfams",
                    # what = "GOs",
                    org = sample.name[[2]],
                    bg = expressed[[2]],
                    filter = T,
                    plot = F,
                    # plot = T,
                    expand = T) %>% 
    
    # nothing much to talk about things that have no annotations ---
    filter(annot_desc!="no_desc") %>%
    # save the results to a file
    arrange(annot_desc) %>% 
    as.data.frame()
    
  }
  
  
  # writeLines(paste0("\n", "n(overrepresented terms) = ", nrow(gsoi.3[[i]]), "\n"))
  # print(overrepresented.terms %>% as_tibble())
  
  # Stacked zplot
      stacked.plot1 <- genes %>% stacked.zplot_tc6(cond = sampleName[[1]], plot.mean = F) %>% pluck(1)
      # stacked.plot2 <- genes %>% stacked.zplot_tc6(cond = sampleName[[2]]) %>% pluck(1)
      ggpubr::ggarrange(plotlist=list(stacked.plot1),
                    nrow = 1, ncol = 1,
                    widths = c(1,1), labels = NA) %>% 
        print()
      
      
  # phase plots
    
    # Get the phases of the best matched waveforms
    phase.ejtk <- circular::circular(rhy.24.sig[[2]]$Phase, units="hours", template="clock24")
  
    # save all the circular phases in a list
    l.phases[[i]] <- phase.ejtk
    # let's name the list elements for later use and reference
    names(l.phases)[[i]] <- paste0(sample.name[[2]],"_cluster-",i)
  
    means <- as.numeric(mean(phase.ejtk))
    means <- circular(means, units="hours", template="clock24")
  
    
    # define phase levels
    ordered_phases <- c("2","4","6","8","10","12",
                        "14","16","18","20","22","24")
      
    df.test <-
      l.phases[[i]] %>%
      as.data.frame() %>% 
      mutate(phase = x) %>%
      mutate(phase = replace(phase, x=="0", "24")) %>% 
      select(-x) %>% 
      mutate(phase=factor(phase, levels = ordered_phases, labels = ordered_phases)) %>%
      group_by(phase) %>%
      summarise(n_genes = n()) %>% 
      arrange(phase)
  
    m <- as.numeric(means)
      
    g <-
      ggplot(df.test, aes(x=phase, y=n_genes)) + 
      # indicate light-dark phase
      geom_rect(aes(xmin = as.factor(12), xmax = as.factor(24), ymin = -Inf, ymax = Inf),
                fill = "grey80", alpha = 0.05, color=NA) +
      geom_bar(stat='identity', fill=col.scheme[[2]]) +
      
      xlab(c(names(l.phases)[[i]])) +
      
      # scale_y_continuous(n.breaks = 3) +
      # scale_y_continuous(breaks = c(0,300,600), limits = c(0,700)) +
      
      scale_x_discrete(limits = ordered_phases) +
      
      coord_polar(theta = "x", direction = 1, start = 0.25) +
      theme_Publication(base_size = 20) +
      theme(# axis.title.x=element_blank(),
                # axis.text.x=element_blank(),
                legend.position = "none")
    
    print(g)
  ###  
  ### Phase plot ends
  
      
  # MAKE THE SUPP FILE
  if (nrow(gsoi.3[[i]])!=0) {
  gsoi.3[[i]] <-
    annots[[2]] %>%
    select(gene_name, gene_desc, GOs, pfams, signalP, SSP, TMHMM) %>% 
    filter(gene_name %in% genes) %>% # change here
    left_join(gsoi.3[[i]][,c(1,3)], by="gene_name") %>%  # change here
    select(gene_name, gene_desc, enriched_annot=annot_desc, everything()) %>%
    # mutate(enriched_annot=as.factor(enriched_annot)) %>% 
    arrange(enriched_annot)
  } else {
    gsoi.3[[i]] <-
    annots[[2]] %>%
    select(gene_name, gene_desc, GOs, pfams, signalP, SSP, TMHMM) %>% 
    filter(gene_name %in% genes) %>% # change here
    mutate(annot_desc=NA) %>% 
    select(gene_name, gene_desc, enriched_annot=annot_desc, everything()) %>%
    # mutate(enriched_annot=as.factor(enriched_annot)) %>% 
    arrange(enriched_annot)
  }
}
## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 1
## n(genes) = 833
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 2
## n(genes) = 465
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 3
## n(genes) = 354
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 4
## n(genes) = 633
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."


2. Ocflo v. Okim

## read the ocflo to okim homology data
ocflo.okim.homology <-
  read.csv(paste0(path_to_repo, "/supplement/1_Supplementary_file_ophio_cflo_TC6_data.csv"),
           header = T, stringsAsFactors = F) %>% as_tibble() %>% 
  select(ocflo_gene=gene_ID_ncbi, ophio_kim_homolog) %>% 
  left_join(ophio_kim_annots[1:2], by=c("ophio_kim_homolog"="sc16a_gene")) %>% 
  select(ocflo_gene, okim_gene=gene_name) %>% 
  na.omit() %>% 
  distinct()

# define gene sets for fisher's test
set1 <- unique(rhy[[2]]) # rhy24 in ocflo
set2 <- ocflo.okim.homology %>% filter(okim_gene %in% rhy24.okim) %>% pull(ocflo_gene) %>% unique() # rhy24 in okim

## Are the 24h-rhythmic genes in ocflo and okim similar?
fishers_overlap(set1, 
                set2,
                bg.genes = max(length(expressed[[2]]), length(expressed.okim)))
## Contingency table:
## 
##           in.set.2 not.set.2
## in.set.1       357      1928
## not.set.1      772      5093
## Running fisher.test() on contingency table:
## 
## 
##  Fisher's Exact Test for Count Data
## 
## data:  test.table
## p-value = 0.004285
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.063039 1.401935
## sample estimates:
## odds ratio 
##   1.221501
## What is enriched in the 24h-rhythmic genes overlapping in Ocflo and Okim?
intersect(set1, set2) %>% 
  check_enrichment(.,
                   org = "ophio_cflo",
                   what = "pfams", 
                   bg = "all", # because homologs are identified in the entire genome of the fungus, not only expressed genes!
                   expand = F)
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## # A tibble: 3 x 7
##   annot_term annot_desc  adj_pVal sam_freq back_freq n_annot_bg gene_name       
##   <chr>      <chr>          <dbl>    <dbl>     <dbl>      <int> <chr>           
## 1 PF04082    Fungal_tra…  0.00053    0.043     0.012         55 GQ602_000498, G…
## 2 PF00172    Zn_clus      0.00186    0.047     0.016         78 GQ602_000498, G…
## 3 PF13450    NAD_bindin…  0.00494    0.026     0.008         36 GQ602_001211, G…

Unique rhy ocflo / okim

What are the functions of the genes that show sig. 24h-rhythms uniquely in one species and not the other?

We explore functions of the genes that are rhythmic in Okim but not in Ocflo.

# ## rhy-unique-ocflo
# # pfams1 <-
# setdiff(set1, set2) %>%
# check_enrichment(.,
#                  org = "ophio_cflo",
# what = "pfams", expand = F, verbose = T)

## rhy-unique-okim
# pfams2 <- 
  setdiff(set2, set1) %>%
  check_enrichment(.,
                   org = "ophio_cflo",
                   what = "pfams", 
                   bg = "all", # because homologs are identified in the entire genome of the fungus, not only expressed genes!
                   expand = T) %>% 
  arrange(annot_desc)
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## # A tibble: 39 x 3
## # Groups:   gene_name [39]
##    gene_name    gene_desc                             annot_desc   
##    <chr>        <chr>                                 <chr>        
##  1 GQ602_000135 putative enterotoxin                  Enterotoxin_a
##  2 GQ602_000845 putative enterotoxin                  Enterotoxin_a
##  3 GQ602_002072 heat-labile enterotoxin A subunit     Enterotoxin_a
##  4 GQ602_002076 putative enterotoxin                  Enterotoxin_a
##  5 GQ602_002091 putative enterotoxin                  Enterotoxin_a
##  6 GQ602_002388 heat-labile enterotoxin subunit alpha Enterotoxin_a
##  7 GQ602_003198 putative enterotoxin                  Enterotoxin_a
##  8 GQ602_006721 heat-labile enterotoxin IIB, A chain  Enterotoxin_a
##  9 GQ602_007183 putative enterotoxin                  Enterotoxin_a
## 10 GQ602_007416 putative enterotoxin                  Enterotoxin_a
## # … with 29 more rows
  # filter(annot_desc == "GMC_oxred_N; GMC_oxred_C")
  # filter(annot_desc == "p450")
  # filter(annot_desc == "FAD_binding_3")
  # filter(annot_desc == "Kinase-like")
  # filter(annot_desc == "Enterotoxin_a")

### NO SIGNIFICANT OVERLAP BETWEEN pfams1 and pfams2 (no overlapping pfams found; I checked.)

What about the rhythmic genes in Ocflo that do not have a homolog in Okim?

# setdiff(set1, ocflo.okim.homology %>% pull(ocflo_gene)) %>%
#   check_enrichment(.,
#                    org = "ophio_cflo",
#                    what = "pfams",
#                    bg = "all", # because homologs are identified in the entire genome of the fungus, not only expressed genes!
#                    expand = T) %>%
#   arrange(annot_desc)

115 such genes, but no enriched pfams.

Time-of-day preference?

Are these overlapping genes day- or night-active in Ocflo?

clusters <- list()
for (i in 1:n_clusters) {
  clusters[[i]] <- my_gene_col[[2]] %>% rownames_to_column("g") %>% filter(cluster==as.character(i)) %>% pull(g)
  names(clusters)[[i]] <- paste0("Ocflo_24_cluster-",i)
}

overlaps <- list()
overlaps[[1]] <- intersect(set1,set2)
overlaps[[2]] <- setdiff(set1,set2) # rhy in Ocflo but not in Okim
overlaps[[3]] <- setdiff(set1, ocflo.okim.homology %>% pull(ocflo_gene)) # rhy in Ocflo but do not have an homolog in Okim
names(overlaps) <- c("rhy_in_both", "rhy_in_Ocflo_only", "rhy_in_Ocflo_no_homolog")

## Pairwise Fisher's exact test to check for sig. overlap
bg.genes <- expressed[[2]] %>% unique() %>% length() # all expressed genes in Ocflo

#### SAVE FIGURE ####
check_overlap(overlaps, clusters, bg.genes)

## $length.geneset
##             rhy_in_both       rhy_in_Ocflo_only rhy_in_Ocflo_no_homolog 
##                     357                    1928                     115 
##      Ocflo_24_cluster-1      Ocflo_24_cluster-2      Ocflo_24_cluster-3 
##                     833                     465                     354 
##      Ocflo_24_cluster-4 
##                     633 
## 
## $intersection
##                         Ocflo_24_cluster-1 Ocflo_24_cluster-2
## rhy_in_both                            136                103
## rhy_in_Ocflo_only                      697                362
## rhy_in_Ocflo_no_homolog                 42                 25
##                         Ocflo_24_cluster-3 Ocflo_24_cluster-4
## rhy_in_both                             48                 70
## rhy_in_Ocflo_only                      306                563
## rhy_in_Ocflo_no_homolog                 21                 27
## 
## $odds.ratio
##                         Ocflo_24_cluster-1 Ocflo_24_cluster-2
## rhy_in_both                       5.245949           7.029400
## rhy_in_Ocflo_only                20.525183          11.140209
## rhy_in_Ocflo_no_homolog           4.429801           4.066339
##                         Ocflo_24_cluster-3 Ocflo_24_cluster-4
## rhy_in_both                       3.215073           2.632598
## rhy_in_Ocflo_only                19.716838          29.463273
## rhy_in_Ocflo_no_homolog           4.392531           3.177232
## 
## $pval
##                         Ocflo_24_cluster-1 Ocflo_24_cluster-2
## rhy_in_both                   1.248892e-39       3.569171e-41
## rhy_in_Ocflo_only            1.610763e-291      1.125101e-121
## rhy_in_Ocflo_no_homolog       5.033369e-12       8.902155e-08
##                         Ocflo_24_cluster-3 Ocflo_24_cluster-4
## rhy_in_both                   2.408908e-10       1.739147e-10
## rhy_in_Ocflo_only            3.349399e-126      1.552304e-258
## rhy_in_Ocflo_no_homolog       2.242068e-07       2.611106e-06

The results suggest that the homologous genes that oscillate in both Ophio species do not show a specific time-of-day of activity (e.g., day or night-active); rather these genes are distributed throughout.


Save Supp. file 3

# sheet1
# all ocflo_annots file with okim homology, okim rhythmicity, ocflo rhythmicity data

gsoi.3[[5]] <- 
  ophio_cflo_annots %>% 
  
  ## add okim homolog name, if exists (if not, NA)
  left_join(ocflo.okim.homology, by=c("gene_name"="ocflo_gene")) %>% 
  select(ocflo_gene = gene_name, okim_homolog=okim_gene, everything()) %>% 
  
  ## add ocflo_rhy_data (options: yes, NA)
  mutate(rhy24_ocflo = ifelse(ocflo_gene %in% rhy[[2]], "yes",NA)) %>% 
  
  ## add okim rhy data
  mutate(rhy24_okim = ifelse(okim_homolog %in% rhy24.okim, "yes",NA)) %>% 
  arrange(rhy24_ocflo,rhy24_okim) %>% 
  
  
  select(ocflo_gene, okim_homolog, rhy24_ocflo, rhy24_okim, gene_desc, GOs:TMHMM,
         ocflo_arb2_gene = arb2_gene, okim_sc16a_homolog = sc16a_homolog)


# sheets: 2 to 5
# genes in each cluster separately

for (i in 1:4) {
  gsoi.3[[i]] <- 
    gsoi.3[[5]] %>% 
    
    filter(ocflo_gene %in% gsoi.3[[i]][[1]]) %>% 
    
    left_join(gsoi.3[[i]][,c(1,3)], by=c("ocflo_gene"="gene_name")) %>% 
    mutate(ocflo_24h_cluster = as.character(i)) %>% 
    
    select(-rhy24_ocflo) %>% 
    select(ocflo_24h_cluster, ocflo_gene:gene_desc, enriched_annot, everything()) %>% 
    arrange(enriched_annot, rhy24_okim)  
  }
  
# ## Save results of not_expressed genes into an excel file
# supp <- list(
#   "ocflo_okim_homolog_rhy24" = gsoi.3[[5]],
#   "rhy24_ocflo_cluster1" = gsoi.3[[1]],
#   "rhy24_ocflo_cluster2" = gsoi.3[[2]],
#   "rhy24_ocflo_cluster3" = gsoi.3[[3]],
#   "rhy24_ocflo_cluster4" = gsoi.3[[4]]
#   )
# writexl::write_xlsx(supp,
#                     path = paste0(path_to_repo,"/results/00_supplementary_files/03_gsoi_rhy24_ocflo_okim.xlsx"
#                                   ))

SIDE NOTE: reducing redundant GO terms

Note to self: If we use GO terms instead of pfams, we will need to reduce the redundant GO annotations that is present in fungal annotation data.

Two options to do this:

Option #1: Get the list of overrepresented GO terms and their associated p-values and use REVIGO portal online to reduce the redundant terms

Option #2: Use the scripts provided by REVIGO to programmatically run REVIGO using bash/R. For more information see (here)[http://revigo.irb.hr/FAQ.aspx#q07] Status: tried running it via bash, and it didn’t work; NEED TO FIGURE IT OUT.


3. Beau v. Ophio - 24h-rhy

Next, we compare the homologous genes in both the fungi to understand if the rhythmic genes (and processes) in the two fungi are similar or not; also, is there any differences in the daily expression of these genes between the two fungal parasites?

Obtain homology data

# Read the source file
homology.file <- "ophio_beau_homology.csv"
homology.file <- 
  paste0(path_to_repo, "/results/proteinortho/", homology.file) %>% 
  read.csv(., stringsAsFactors = F, na.strings = c(" ","","NA"))

# Clean the source file to keep distinct gene-gene homologs
homology.dat <-
  homology.file %>% 
  # names() %>% 
  select(ophio_gene, beau_gene) %>% 
  na.omit() %>% 
  distinct() %>% 
  group_by(beau_gene) %>% 
  filter(n()==1) %>% 
  select(beau_gene, ophio_gene)

writeLines(paste("Of the", length(unique(ophio_cflo_annots[["gene_name"]])), "genes annotated in Ophio-cflo genome,",
                 "and", length(unique(beau_annots[["gene_name"]])), "genes annotated in Beau genome,\n",
                 nrow(distinct(homology.dat)), "genes show one-to-one orthology"))
## Of the 7455 genes annotated in Ophio-cflo genome, and 10364 genes annotated in Beau genome,
##  5274 genes show one-to-one orthology

Summarize the results

for (i in 1:2){
  
  # exp.dat <- expressed[[i]]
  rhy.dat <- rhy[[i]]
  ortho.dat <- homology.dat %>% pull(i) 

  listInput <- list(rhy.dat, ortho.dat)
  names(listInput) <- c(paste0(sample.name[[i]],"_rhy24"),"is_an_ortholog")
  
  library(UpSetR)
  library(viridis)
  # caste.col <- c("#F23030","#1A80D9")
  upset(fromList(listInput), 
        number.angles = 0, point.size = 3, line.size = 1.5, 
        mainbar.y.label = "Number of overlapping genes", 
        sets.x.label = "Sig. rhy genes", 
        text.scale = c(1.5, # y-axis label ("# overlapping genes")
                       2, # y-axis tick labels ("1000, 2000,..")
                       1.5, # label for histogram ("sig. rhy genes")
                       1, # tick labels for histogram
                       1.5, # set names ("Cflo-brain_08h,..") 
                       1.5),
        sets = names(listInput),
        nintersects = 15,
        keep.order = T,
        sets.bar.color = viridis(1),
        # adding queries
        query.legend = "bottom"
        ) %>% 
    print()
  
  writeLines(paste0(
    "The above plot shows that of the ", length(rhy.dat), " rhythmic genes in ", sample.name[[i]],
    ", how many has an 1:1 ortholog in the other fungal species. \n",
    "Note, orthologous genes were identified using proteinortho "))
}

## The above plot shows that of the 1872 rhythmic genes in beau, how many has an 1:1 ortholog in the other fungal species. 
## Note, orthologous genes were identified using proteinortho

## The above plot shows that of the 2285 rhythmic genes in ophio_cflo, how many has an 1:1 ortholog in the other fungal species. 
## Note, orthologous genes were identified using proteinortho

Rhy24 genes w/o orthologs

First, let’s explore the genes that are 24h-rhythmic in a fungal species but lack a one-to-one ortholog in the other species.

### Gene set of interest 4 ###
gsoi.4 <- list()
# names(gsoi.4) <- "rhy24_genes_ocflo_beau_no_homologs"

rhy.not.homolog <- list()

## loop starts
for(i in 1:2){
  # save the names of genes 
  rhy.not.homolog[[i]] <- setdiff(rhy[[i]], homology.dat[[i]])
  
  # run enrichment
  gsoi.4[[i]] <- 
  rhy.not.homolog[[i]] %>% 
    check_enrichment(.,
                     org=sample.name[[i]],
                     bg = "all",
                     expand = T,
                     filter = T,
                     what = "pfams") %>% 
    
    # nothing much to talk about things that have no annotations ---
    filter(annot_desc!="no_desc") %>%
    
    # save the results to a file
    arrange(annot_desc)
    
    # filter(annot_desc=="Enterotoxin_a") %>% 
    # pull(gene_name) %>% 
    # stacked.zplot_tc6(cond="ophio",plot.mean = F, bg.alpha = 0.5)
    
    # print()
  
  # MAKE THE SUPP FILE
  gsoi.4[[i]] <-
    annots[[i]] %>%
    select(gene_name, gene_desc, GOs, pfams, signalP, SSP, TMHMM) %>% 
    filter(gene_name %in% rhy.not.homolog[[i]]) %>%  # change here
    left_join(gsoi.4[[i]][,c(1,3)], by="gene_name") %>%  # change here
    select(gene_name, gene_desc, enriched_annot=annot_desc, everything()) %>%
    # mutate(enriched_annot=as.factor(enriched_annot)) %>% 
    arrange(enriched_annot)
  
  
}
## [1] "Loading annotation file for Beauveria bassiana"
## [1] "Done."
## [1] "Testing for enrichment..."

## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## loop ends  
  
# ## Save results of not_expressed genes into an excel file
# supp <- list(
#   "rhy24_beau_no_ortholog" = gsoi.4[[1]],
#   "rhy24_ophio_no_ortholog" = gsoi.4[[2]])
# writexl::write_xlsx(supp,
#                     path = paste0(path_to_repo,
#                                   "/results/00_supplementary_files/04_gsoi_rhy24_beau_ophio_not_ortholog.xlsx"))

Rhy24 genes with orthologs

Next, I wanted to see if certain gene clusters show a characteristic difference in their daily expression in a species-specific manner (meaning, same genes but different daily expression in the two fungal species?).

To do so, I performed hierarchical clustering of all orthologous genes that show sig. 24h rhythms in EITHER Ocflo OR Beau to identify clusters of genes that show a synchronized expression in each of the two fungal species.

rhy.homology.dat <-
  homology.dat %>% 
  filter(beau_gene %in% rhy[[1]] | ophio_gene %in% rhy[[2]])

Hierarchical clustering

### Make the dataframe for plotting
zscore.rhy.homology.dat <-
  zscore.dat[[1]] %>% 
  filter(gene_name %in% rhy.homology.dat[[1]]) %>% 
  rename_at(vars(starts_with("ZT")), ~ (gsub("A", "B", .x, fixed = TRUE))) %>% # fix colnames for beau
  # add ophio homologs for the beau genes
  left_join(rhy.homology.dat, by=c("gene_name" = "beau_gene")) %>%
  # remove the beau names and keep the ophio names only
  select(-1) %>% 
  select(gene_name = ophio_gene, everything()) %>% 
  # join ophio-cflo data
  left_join(zscore.dat[[2]], by="gene_name") %>%
  # drop any genes without expression values (NA)
  na.omit() %>% 
  as.data.frame() %>% 
  # set genes as rownames
  column_to_rownames("gene_name")

# Set genes as rownames and convert it into a matrix
# rownames(zscore.rhy.homology.dat) = zscore.rhy.homology.dat$gene_name
zscore.rhy.homology.dat <- as.matrix(zscore.rhy.homology.dat)


# Hierarchical clustering of the genesets
my_hclust_gene <- hclust(dist(zscore.rhy.homology.dat), method = "complete")


# Make annotations for the heatmaps
n_clusters <- 4
my_clusters <- cutree(tree = as.dendrogram(my_hclust_gene), k = n_clusters) # k=  clusters
my_gene_col <- data.frame(cluster = my_clusters)


# I’ll add some column annotations and create the heatmap.
# Annotations for:
# 1. Is the sample collected during the light or dark phase? 
my_sample_col <- data.frame(phase = rep(rep(c("light", "dark", "light"),c(5,6,1)),2),
                            conds = rep(c("beau", "ophio_cflo"), each=12))
row.names(my_sample_col) <- colnames(zscore.rhy.homology.dat)

# Manual color palette
my_colour = list(
  phase = c(light = "#F2E18D", dark = "#010440"),
  conds = c(beau = "#5A829F", ophio_cflo = "#AD212F"),
  cluster = viridis::cividis(100)[c(seq(10,80,by=round(80/(n_clusters), 0)))])

# Color scale
my.breaks = seq(min(zscore.rhy.homology.dat), max(zscore.rhy.homology.dat), by=0.1)
# my.breaks = seq(min(zscore.rhy), max(zscore.rhy), by=0.06)

# Let's plot!
pheatmap(zscore.rhy.homology.dat, show_rownames = F, show_colnames = F,
           annotation_row = my_gene_col, 
           annotation_col = my_sample_col,
           cutree_rows = n_clusters, # OG was 4
           # cutree_cols = 4,
           gaps_col = c(12,24),
           annotation_colors = my_colour,
           border_color=FALSE,
           cluster_cols = F,
           breaks = my.breaks,
           ## color scheme borrowed from: 
           color = inferno(length(my.breaks) - 1),
           treeheight_row = 0,
           # treeheight_col = 0,
           # remove the color scale or not
           main = paste0("24h-rhythmic in Ocflo or Beau \n (n=",
                         nrow(zscore.rhy.homology.dat), " orthologous genes)"),
           ## annotation legend
           annotation_legend = T,
           ## Color scale
           legend = T)

Individual clusters

### Gene set of interest 5 ###
gsoi.5 <- list()
# names(gsoi.5) <- "rhy24_genes_ocflo_or_beau_homologs"
gsoi.5.beau <- list()
gsoi.5.ophio <- list()

sampleName <- c("ophio_cflo","ophio_ophio-infected")
## loop starts
for (j in 1:2) {
  
  for (i in 1:n_clusters){
    
    writeLines(paste0("Species: ", sample.name[[j]], "\n", "24h-rhythmic genes, Cluster: ", i))
    
    # Summary
    genes <- my_gene_col %>% rownames_to_column("g") %>% filter(cluster==as.character(i)) %>% pull(g)
    writeLines(paste0("n(genes) = ", length(genes),"\n"))
    
    # define the background geneset for enrichment analysis
    bg.genes <- homology.dat %>% pull(ophio_gene) %>% unique()
    
    ## Transform gene names (ophio -> beau) and refine background geneset
    
    if(j==1) {
    genes <- 
        homology.dat %>% 
        filter(ophio_gene %in% genes) %>% 
        pull(beau_gene)
      bg.genes <- homology.dat %>% pull(beau_gene) %>% unique()
      
    # Enrichment - beau
    gsoi.5.beau[[i]] <-
      genes %>%
        check_enrichment(.,
                      what = "pfams",
                      org = sample.name[[j]],
                      bg = expressed[[j]],
                      filter = T,
                      expand = T,
                      plot = T) %>%

    # print(overrepresented.terms)
    # writeLines(paste0("\n", "n(overrepresented terms) = ", nrow(overrepresented.terms), "\n"))

    # nothing much to talk about things that have no annotations ---
    filter(annot_desc!="no_desc") %>%
    # save the results to a file
    arrange(annot_desc)
    
    } else {
    # Enrichment - Ophio
    gsoi.5.ophio[[i]] <-
      genes %>% 
        check_enrichment(.,
                      # what = "GOs",
                      what = "pfams",
                      org = sample.name[[j]],
                      bg = expressed[[j]],
                      filter = T,
                      expand = T,
                      plot = T) %>% 
    
    # print(overrepresented.terms)
    # writeLines(paste0("\n", "n(overrepresented terms) = ", nrow(overrepresented.terms), "\n"))
      
    # nothing much to talk about things that have no annotations ---
    filter(annot_desc!="no_desc") %>%
    # save the results to a file
    arrange(annot_desc)
    
    # MAKE THE SUPP FILE
    gsoi.5[[i]] <-
      annots[[2]] %>%
      select(gene_name, gene_desc, GOs, pfams, signalP, SSP, TMHMM) %>%
      filter(gene_name %in% genes) %>%  # change here
      left_join(gsoi.5.ophio[[i]][,c(1,3)], by="gene_name") %>%  # change here
      select(ophio_gene=gene_name,
             gene_desc, enriched_annot=annot_desc, everything()) %>%
      # mutate(enriched_annot=as.factor(enriched_annot)) %>%
      arrange(enriched_annot) %>% 

        ## add cluster information
        mutate(clusterID = paste0("cluster_",i)) %>% 

        ## add homology data from beau
        left_join(homology.dat, by=c("ophio_gene")) %>%

        ## add rhythmicity data - ophio_cflo
        left_join(rhy.24.sig[[2]] %>%
                    select(ophio_gene=ID, ophio_pval=GammaP) %>%
                    mutate(ophio_rhy24 = ifelse(ophio_pval < FDR/100, "yes", "no")),
                  by=c("ophio_gene")) %>%

        ## add rhythmicity data - beau
        left_join(rhy.24.sig[[1]] %>%
                    select(beau_gene=ID, beau_pval=GammaP) %>%
                    mutate(beau_rhy24 = ifelse(beau_pval < FDR/100, "yes", "no")),
                  by=c("beau_gene")) %>%

        ## arrange the columns
        select(ophio_gene, gene_desc, clusterID, enriched_annot,
               ophio_rhy24, beau_rhy24, GOs:TMHMM, everything()) %>%

        ## arrange by enriched_annot
        arrange(enriched_annot)
    
    }
    
    
    # # Stacked zplot
    # if (j==2) {
    #   # Stacked zplot
    #   stacked.plot1 <- genes %>% stacked.zplot_tc6(cond = sampleName[[1]]) %>% pluck(1)
    #   stacked.plot2 <- genes %>% stacked.zplot_tc6(cond = sampleName[[2]]) %>% pluck(1)
    #   ggpubr::ggarrange(plotlist=list(stacked.plot1, stacked.plot2),
    #                 nrow = 1, ncol = 2,
    #                 widths = c(1,1), labels = NA) %>% 
    #     print()
    # } else {
    # genes %>% 
    # stacked.zplot_tc6(cond = sample.name[[j]]) %>% 
    #   multi.plot(rows = 1, cols = 1)
    # }
  
  }
}
## Species: beau
## 24h-rhythmic genes, Cluster: 1
## n(genes) = 718
## 
## [1] "Loading annotation file for Beauveria bassiana"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: beau
## 24h-rhythmic genes, Cluster: 2
## n(genes) = 1026
## 
## [1] "Loading annotation file for Beauveria bassiana"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: beau
## 24h-rhythmic genes, Cluster: 3
## n(genes) = 451
## 
## [1] "Loading annotation file for Beauveria bassiana"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: beau
## 24h-rhythmic genes, Cluster: 4
## n(genes) = 324
## 
## [1] "Loading annotation file for Beauveria bassiana"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 1
## n(genes) = 718
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 2
## n(genes) = 1026
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 3
## n(genes) = 451
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## Species: ophio_cflo
## 24h-rhythmic genes, Cluster: 4
## n(genes) = 324
## 
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

#################
### loop ends ###
#################

# ## Save results of not_expressed genes into an excel file
#   supp <- list(
#     "ortholog_cluster1" = gsoi.5[[1]],
#     "ortholog_cluster2" = gsoi.5[[2]],
#     "ortholog_cluster3" = gsoi.5[[3]],
#     "ortholog_cluster4" = gsoi.5[[4]])
#   writexl::write_xlsx(supp,
#                       path = paste0(path_to_repo,
#                                     "/results/00_supplementary_files/05_gsoi_rhy24_beau_ophio_ortholog.xlsx"))
#   # update counter
#   counter <- counter+1

MAKE SUPP FILES; STOPPED HERE !!!

Overlap b/n clusters and rhy24

## Visualize the overlap

cluster.dat <- list()
for (i in 1:n_clusters) {
    cluster.dat[[i]] <- my_gene_col %>% 
rownames_to_column("g") %>% filter(cluster==as.character(i)) %>% pull(g)
}
names(cluster.dat) <- paste0("Cluster_",1:4)

for (j in 1:2) {
    
    rhy.dat <- rhy[[j]]
    cluster.dat.dummy <- cluster.dat
    
    if (j == 1) {
      for (i in 1:n_clusters) {
        cluster.dat.dummy[[i]] <- 
            homology.dat %>% 
            filter(ophio_gene %in% cluster.dat.dummy[[i]]) %>% 
            pull(beau_gene)
      }
    }
    
    listInput <- list(rhy.dat, 
                      cluster.dat.dummy[[1]], cluster.dat.dummy[[2]], 
                      cluster.dat.dummy[[3]], cluster.dat.dummy[[4]])
    names(listInput) <- c(paste0(sample.name[[j]], c("_rhy24")), paste0("cluster_",1:4))
    
    library(UpSetR)
    library(viridis)
    # caste.col <- c("#F23030","#1A80D9")
    upset(fromList(listInput), 
          number.angles = 0, point.size = 3, line.size = 1.5, 
          mainbar.y.label = "Number of overlapping genes", 
          sets.x.label = "Sig. rhy genes", 
          text.scale = c(1.5, # y-axis label ("# overlapping genes")
                         2, # y-axis tick labels ("1000, 2000,..")
                         1.5, # label for histogram ("sig. rhy genes")
                         1, # tick labels for histogram
                         1.5, # set names ("Cflo-brain_08h,..") 
                         1.5),
          sets = names(listInput),
          nintersects = 15,
          keep.order = T,
          sets.bar.color = viridis(1),
          # adding queries
          query.legend = "bottom"
          ) %>% 
      print()
  
}

It seems that majority of the genes in Cluster 3 and 4 are sig. rhythmic in Ophio but not in Beau. We will perform the pairwise Fisher’s exact test to find out. Let’s dig in!

NOTE: We need to think about the best way to perform the Fisher’s exact test. For starters, I am transforming all the gene names to Ophio

# LIST ONE - Cluster identity
list1 <- cluster.dat
names(list1) <- names(cluster.dat)

## LIST TWO - ophio rhythmic genes
beau.ortho.rhy <- homology.dat %>% filter(beau_gene %in% rhy[[1]]) %>% pull(ophio_gene) %>% unique() 
ocflo.ortho.rhy <- homology.dat %>% filter(ophio_gene %in% rhy[[2]]) %>% pull(ophio_gene) %>% unique()
list2 <- list(beau.ortho.rhy, ocflo.ortho.rhy)
names(list2) <- paste0(sample.name[1:2], c("_24h"))

## CHECK FOR OVERLAP
library(GeneOverlap)

# define the background geneset 
# in our case, it would be the number of orthologous genes between beau and Ophio_cflo
nGenes = homology.dat %>% pull(ophio_gene) %>% unique() %>% length()

## make a GOM object
gom.1v2 <- newGOM(list1, list2, genome.size = nGenes)
png(paste0(path_to_repo, "/results/figures/BD/ocflo_beau_orthologs_rhy_overlap.png"),
    width = 15, height = 15, units = "cm", res = 300)
drawHeatmap(gom.1v2,
              adj.p=T,
              cutoff=0.01,
              what="odds.ratio",
              # what="Jaccard",
              log.scale = T,
              note.col = "grey60")
trash <- dev.off()
Orthologous rhy24 genes

Orthologous rhy24 genes

As we predicted, Cluster 3 and 4 genes show a stronger overlap with 24h-rhythmic genes in O. cflo as comapred to Beauveria (as can be seen from both log2-odds ratio and the associated q-value). The signal is strongest for Cluster 4, so let’s see which genes are in this cluster.

Cluster-4 genes

## Get the annotation data
ocflo.annots <- read.csv(paste0(path_to_repo, "/data/ophio_cflo_TC6_data.csv"), stringsAsFactors = F) %>% as.tibble()
ocflo.annots %<>% 
  filter(expressed=="yes") %>%
  select(gene_name = gene_ID_ncbi, gene_ID_robin, blast_annot, GammaP_24h, GOs:ophio_kim_homolog)


## Check these genes for other annotations (signalP, SSP, TMHMM)
# LIST ONE - Cluster identity
list1 <- cluster.dat
names(list1) <- names(cluster.dat)

## LIST TWO - ophio rhythmic genes
signalP <- ocflo.annots %>% filter(signalP == "yes") %>% pull(gene_name)
SSP <- ocflo.annots %>% filter(SSP == "yes") %>% pull(gene_name)
TMHMM <- ocflo.annots %>% filter(TMHMM == "yes") %>% pull(gene_name)
list2 <- list(signalP, SSP, TMHMM)
names(list2) <- paste0(sample.name[[2]], "-", c("signalP", "SSP", "TMHMM"))

## CHECK FOR OVERLAP
library(GeneOverlap)

# define the background geneset 
# in our case, it would be the number of orthologous genes between beau and Ophio_cflo
nGenes = ocflo.annots %>% nrow() 

## make a GOM object
gom <- newGOM(list1, list2, genome.size = nGenes)
png(paste0(path_to_repo, "/results/figures/BD/ocflo_beau_orthologs_annots_overlap.png"),
    width = 15, height = 15, units = "cm", res = 300)
drawHeatmap(gom,
              adj.p=T,
              cutoff=0.01,
              what="odds.ratio",
              # what="Jaccard",
              log.scale = T,
              note.col = "grey60")
trash <- dev.off()
Overlap of orthologous rhy24 gene clusters with additional annotations

Overlap of orthologous rhy24 gene clusters with additional annotations

4. Ophio DEGs during infection

Do prep for analyses

Prepare the functions, libraries required

# Let's load functions for running limorhyde
source(system.file('extdata', 'vignette_functions.R', package = 'limorhyde'))
# Let's load the libraries required for running Limorhyde
# library('annotate')
library('data.table')
library('foreach')
# library('GEOquery')
library('ggplot2')
library('knitr')
library('limma')
library('limorhyde')

conflict_prefer("union", "dplyr")

Format metadata

Create dataframe with metadata information for the different samples collected

sampleName <- c("ophio_cflo","ophio_ophio-infected")
short.name <- c("AC","AI") # AC = arb2-control, AI = arb2-infection
time.points <- c(2,4,6,8,10,12,14,16,18,20,22,24)
light.dark <- c(rep("light",times=5), rep("dark",times=6), rep("light", times=1))

meta <- data.frame(title = paste0(rep(sampleName, each=12),"_ZT",time.points),
                       sample = paste0(rep(time.points, times=2),rep(short.name, each=12)),
                       genotype = rep(sampleName, each=12),
                       time = rep(time.points, times=2),
                       cond = rep(sampleName, each=12),
                       LD = rep(light.dark, times=2),
                       stringsAsFactors = F)

meta %>% glimpse()
## Observations: 24
## Variables: 6
## $ title    <chr> "ophio_cflo_ZT2", "ophio_cflo_ZT4", "ophio_cflo_ZT6", "ophio…
## $ sample   <chr> "2AC", "4AC", "6AC", "8AC", "10AC", "12AC", "14AC", "16AC", …
## $ genotype <chr> "ophio_cflo", "ophio_cflo", "ophio_cflo", "ophio_cflo", "oph…
## $ time     <dbl> 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 2, 4, 6, 8, 10, …
## $ cond     <chr> "ophio_cflo", "ophio_cflo", "ophio_cflo", "ophio_cflo", "oph…
## $ LD       <chr> "light", "light", "light", "light", "light", "dark", "dark",…

Now, format the metadata.

### 1.1.1 Format the meta-data ----------------
# load the meta-data
sm <- meta
# Let's format the columns in the right data-type
sm$time <- as.numeric(sm$time)
# sm$batch <- as.factor(sm$batch)
sm$LD <- as.factor(sm$LD)
# sm$location <- as.factor(sm$location)

# Let's get a glimpse of the metadata
sm %>% as_tibble() %>% head()
## # A tibble: 6 x 6
##   title           sample genotype    time cond       LD   
##   <chr>           <chr>  <chr>      <dbl> <chr>      <fct>
## 1 ophio_cflo_ZT2  2AC    ophio_cflo     2 ophio_cflo light
## 2 ophio_cflo_ZT4  4AC    ophio_cflo     4 ophio_cflo light
## 3 ophio_cflo_ZT6  6AC    ophio_cflo     6 ophio_cflo light
## 4 ophio_cflo_ZT8  8AC    ophio_cflo     8 ophio_cflo light
## 5 ophio_cflo_ZT10 10AC   ophio_cflo    10 ophio_cflo light
## 6 ophio_cflo_ZT12 12AC   ophio_cflo    12 ophio_cflo dark
# Next we use limorhyde to calculate time_cos and time_sin, which are based on the first
#harmonic of a Fourier decomposition of the time column, and append them to the sm data frame.
sm = cbind(sm, limorhyde(sm$time, 'time_'))
# convert the dataframe into a data.table
sm <- data.table(sm)
# check that it worked
sm[1:5, ]
##              title sample   genotype time       cond    LD      time_cos
## 1:  ophio_cflo_ZT2    2AC ophio_cflo    2 ophio_cflo light  8.660254e-01
## 2:  ophio_cflo_ZT4    4AC ophio_cflo    4 ophio_cflo light  5.000000e-01
## 3:  ophio_cflo_ZT6    6AC ophio_cflo    6 ophio_cflo light  6.123234e-17
## 4:  ophio_cflo_ZT8    8AC ophio_cflo    8 ophio_cflo light -5.000000e-01
## 5: ophio_cflo_ZT10   10AC ophio_cflo   10 ophio_cflo light -8.660254e-01
##     time_sin
## 1: 0.5000000
## 2: 0.8660254
## 3: 1.0000000
## 4: 0.8660254
## 5: 0.5000000

load data

## DATASET 1
## Load the control O.cflo data (from TC6)
ocflo.control.dat <-
  data.db %>%
  tbl(., paste0(sampleName[[1]], "_fpkm")) %>%
  select(gene_name, everything()) %>%
  collect()

## DATASET 2
## Load the O.cflo infection data from the mixed transcriptomics study (from TC7)
# src_dbi(inf.db)
# extract the (gene-expr X time-point) data
ocflo.inf.dat <-
  inf.db %>%
  tbl(., paste0(sampleName[[2]], "_fpkm")) %>%
  select(gene_name, everything()) %>%
  collect()

filter data

The goal is to use only the genes that show expression (>1 FPKM) for at least half of the timepoints during the 24h day (i.e., 6 of the 12 timepoints).

## DATASET 1
n.exp.1 <- apply(ocflo.control.dat[-1], 1, function(x) sum(x>=1))
ocflo.control.dat <- ocflo.control.dat[which(n.exp.1>=6),]
colnames(ocflo.control.dat)[-1] <- paste0("ZT", meta[meta$cond==sampleName[[1]],] %>% pull(sample))

## DATASET 2
n.exp.2 <- apply(ocflo.inf.dat[-1], 1, function(x) sum(x>=1))
ocflo.inf.dat <- ocflo.inf.dat[which(n.exp.2>=6),]
colnames(ocflo.inf.dat)[-1] <- paste0("ZT", meta[meta$cond==sampleName[[2]],] %>% pull(sample))

## Use the genes that are expressed in both conditions
emat <-
  ocflo.control.dat %>%
  filter(gene_name %in% ocflo.inf.dat$gene_name) %>%
  left_join(ocflo.inf.dat, by="gene_name") %>%
  as.data.frame()

Run analyses

Next, let’s perform the DEG analyses for the ophio-cflo (halfway through infection v. controls)

### Convert to a matrix

# save gene names as row names
rownames(emat) <- emat[,1]
emat <- emat[,-1]
# Need to make the emat into a matrix.
emat <- data.matrix(emat)
# log2 transform the data
emat <- log2(emat + 1)


### Set thresholds
# Set threshold for q-value and log2FC
q.threshold <- 0.05 # currently, using 5% FDR (BH adjusted p-value)
log2.foldchange <- 1 # thus, any gene with a 2^(log2.foldchange) fold change in it's expression

### Format the metadata, if necessary
# Filter the metadata according to your comparison
sm.sub <- sm %>% filter(cond %in% c(sampleName))
# Define the cond column as a factor
sm.sub$cond <- as.factor(sm.sub$cond)

### Let's run the DEG analyses
# Use the subsetted emat to find DEGs
design.deg = model.matrix(~ cond + time_cos + time_sin, data = sm.sub)
#
fit = lmFit(emat, design.deg)
fit = eBayes(fit, trend = TRUE)
# Take a look at the coefficients table
# fit$coefficients %>% head()
#
deLimma.deg = data.table(topTable(fit, coef = 2, number = Inf), keep.rownames = TRUE)
setnames(deLimma.deg, 'rn', 'gene_name')
deLimma.deg[, adj.P.Val := p.adjust(P.Value, method = 'BH')]
setorderv(deLimma.deg, 'adj.P.Val')

### Annotate the results
# Annotate the results to indicate the significant genes
all.DEGs <-
  deLimma.deg %>%
  arrange(desc(abs(logFC)), adj.P.Val) %>%
  mutate(sig = as.factor(ifelse(adj.P.Val < q.threshold & abs(logFC) >= log2.foldchange, "yes", "no"))) %>%
  mutate(inf_v_control = as.factor(ifelse(sig=="yes", ifelse( logFC > 0, "up", "down" ), "NA"))) %>%
  mutate(inf = sampleName[[2]])


# ## Save the DEG results to a Rdata file
# save(all.DEGs, file = paste0(path_to_repo, "/results/deg_analysis/ocflo_inf_v_controls.RData"))
# ## load the results of the DEG analysis
# load(file = paste0(path_to_repo, "/results/deg_analysis/ocflo_inf_v_controls.RData"))


### Summarize the results
writeLines(paste0("\nControl-", sampleName[[1]], " v. ", sampleName[[2]], "\n--Results of DEG analysis--"))
## 
## Control-ophio_cflo v. ophio_ophio-infected
## --Results of DEG analysis--
## How many DEGs - 5% FDR and ≥ 1 fold change in gene expression
all.DEGs %>% 
  # filter(adj.P.Val < q.threshold) %>%
  # filter(abs(logFC) >= 2) %>% # change the criteria here for top DEG or all DEG (logFC≥1)
  filter(sig == "yes") %>% 
  
  # pull(gene_name) %>% 
  
  group_by(inf_v_control) %>% 
  summarise(n_genes = n()) %>% 
  as.data.frame() %>% 
    ## n = 81 up- and 141 down-regulated genes in Cflo heads during Ophio-infection 
    ## (at 5% FDR; log2-fold-change ≥ 1) 
  print()
##   inf_v_control n_genes
## 1          down     395
## 2            up     318
### Subset to keep only sig. DEGs
sig.DEGs <- all.DEGs %>% filter(sig=="yes")

Visualize results

# Volcano plot 
library(viridis)

ggplot(all.DEGs) +
  # geom_hline(yintercept = -log10(0.05), col="red", alpha=0.6) +
  # geom_vline(xintercept = c(-2,2), col="grey60", alpha=0.75) +
  geom_point(aes(x = logFC, y = -log10(adj.P.Val), color=sig), size = 1.5, alpha = 0.5) +
  labs(x = expression(log[2]*' fold-change (inf_v_control)'), 
       y = expression(-log[10]*' '*q[DE]),
       title = "O.cflo (infection v. control)",
       color = "significant") +
  # scale_x_continuous(limits = c(-5,3),
  #                    breaks = c(-5,-4,-3,-2,-1,0,1,2,3),
  #                    labels = c("-5","","-3","","-1","","1","","3")) +
  # xlim(c(-50,50)) +
  theme_Publication() +
  scale_color_viridis(discrete = T, direction = -1, option = "viridis")

Load manip data

## Load the ophio DEG (at manipulation) data from Will et al. 2020
will2020_data <- read.csv(paste0(path_to_repo,
                                 "/data/input/ophio_cflo/complete_annotations/FullBlast_EC05_RNAseq_orignal_copy_26Aug19.csv"),
                          stringsAsFactors = F)
will2020_data %<>% 
  as_tibble() %>% 
  filter(sample_1=="Alive" & sample_2=="Fungus") %>%
  select(arb2_gene, logFC = log2.fold_change., q_value, significant) %>% 
  mutate(logFC=as.numeric(logFC), q_value=as.numeric(q_value)) %>%
  filter(significant=="yes") %>% 
  mutate(up_down = ifelse(logFC > 0, "down", "up")) %>%
  mutate(logFC = -1*logFC) %>% 
  na.omit()

### Change ophio gene names to ncbi IDs
will2020_data %<>% 
  left_join(ocflo.annots[1:3], by=c("arb2_gene"="gene_ID_robin")) %>%
  select(-1) %>% 
  select(gene_name, blast_annot, everything())

5. Overlap between DEGs during infection v. manipulation

### Subset the up/down-regulated genes
### At halfway-through disease progression
inf.up <- sig.DEGs %>% filter(inf_v_control=="up") %>% pull(gene_name) %>% na.omit()
inf.down <- sig.DEGs %>% filter(inf_v_control=="down") %>% pull(gene_name) %>% na.omit()
### At active manipulation
manip.up <- will2020_data %>% filter(up_down=="up") %>% pull(gene_name) %>% na.omit()
manip.down <- will2020_data %>% filter(up_down=="down") %>% pull(gene_name) %>% na.omit()


### Visualize the results
listInput <- list(inf.up, inf.down, manip.up, manip.down)
names(listInput) <- c(paste0("inf_",c("up","down")), paste0("manip_", c("up","down")))
    
library(UpSetR)
library(viridis)
upset(fromList(listInput), 
  number.angles = 0, point.size = 3, line.size = 1.5, 
  mainbar.y.label = "Number of overlapping genes", 
  sets.x.label = "Sig. DE genes", 
  text.scale = c(1.5, # y-axis label ("# overlapping genes")
                 2, # y-axis tick labels ("1000, 2000,..")
                 1.5, # label for histogram ("sig. rhy genes")
                 1, # tick labels for histogram
                 1.5, # set names ("Cflo-brain_08h,..") 
                 1.5),
  sets = names(listInput),
  nintersects = 15,
  keep.order = T,
  sets.bar.color = viridis(1),
  # adding queries
  query.legend = "bottom"
  ) %>% 
print()

### Test significance of overlap
list1 <- list(inf.up, inf.down)
names(list1) <- paste0("inf_",c("up","down"))
list2 <- list(manip.up, manip.down)
names(list2) <- paste0("manip_", c("up","down"))
bg.genes <- all.DEGs %>% nrow()

overlap <- check_overlap(list1, list2, bg.genes)

6. Daily exp (DEGs at inf)

Let’s plot the daily expression of the DEGs (ocflo controls v. during infection)

# inf.down %>% 
#   # stacked.zplot_tc6(cond = "inf")
#   stacked.zplot_tc6(cond = "ophio")

Clustering of DEGs

  • perform hierarchical clustering of DE genes into four clusters;
  • plot time-course heatmaps for the clustered 24h-rhythmic geneset
  • Identify the day-peaking and night-peaking clusters visually.
## Get the ocflo infection timecourse data (log2fpkm)
inf.dat <- inf.db %>% tbl(., paste0(sampleName[[2]], "_log2fpkm")) %>% collect()
colnames(inf.dat)[-1] <- paste0("ZT",meta[meta$cond==sampleName[[2]],] %>% pull(sample))

## Get the ocflo CONTROL timecourse data (log2fpkm)
control.dat <- data.db %>% tbl(., paste0(sampleName[[1]], "_log2fpkm")) %>% collect()
colnames(control.dat)[-1] <- paste0("ZT",meta[meta$cond==sampleName[[1]],] %>% pull(sample))


## Specify parameters
n_clusters <- 4
which.degs <- list(inf.up, inf.down)
names(which.degs) <- c("inf.up", "inf.down")

# which.degs <- list(inf.up, inf.down, manip.up, manip.down)
# names(which.degs) <- c("inf.up", "inf.down", "manip.up", "manip.down")
  
for (i in 1:length(which.degs)) {
    
    ## Which genes to look at?
    
    # which.genes <- c(inf.up,inf.down)
    which.genes <- which.degs[[i]]
    
    ### Make the dataframe for plotting
    deg.dat <-
      control.dat %>% 
      filter(gene_name %in% which.genes) %>% 
      # add data from infection
      left_join(inf.dat, by="gene_name") %>% 
      # drop any genes without expression values (NA)
      na.omit() %>% 
      as.data.frame() %>% 
      # set genes as rownames
      column_to_rownames("gene_name") 
    
    # Set genes as rownames and convert it into a matrix
    # rownames(zscore.rhy.homology.dat) = zscore.rhy.homology.dat$gene_name
    deg.dat <- as.matrix(deg.dat)
    
    # Hierarchical clustering of the genesets
    my_hclust_gene <- hclust(dist(deg.dat), method = "complete")
    
    # Make annotations for the heatmaps
    my_clusters <- cutree(tree = as.dendrogram(my_hclust_gene), k = n_clusters) # k= number of clusters
    my_gene_col <- data.frame(cluster = my_clusters)
    
    
    # I’ll add some column annotations and create the heatmap.
    # Annotations for:
    # 1. Is the sample collected during the light or dark phase? 
    my_sample_col <- data.frame(phase = rep(rep(c("light", "dark", "light"),c(5,6,1)),2),
                                conds = rep(c("ocflo_controls", "ocflo_infection"), each=12))
    row.names(my_sample_col) <- colnames(deg.dat)
    
    # Manual color palette
    my_colour = list(
      phase = c(light = "#F2E18D", dark = "#010440"),
      conds = c(ocflo_controls = col.scheme[[2]], ocflo_infection = col.scheme[[3]]),
      cluster = viridis::cividis(100)[c(seq(10,80,by=round(80/(n_clusters), 0)))])
    
    # Color scale
    my.breaks = seq(min(deg.dat), max(deg.dat), by=2)
    # my.breaks = seq(min(zscore.rhy), max(zscore.rhy), by=0.06)
    
    # Let's plot!
    pheatmap(deg.dat, show_rownames = F, show_colnames = F,
               annotation_row = my_gene_col, 
               annotation_col = my_sample_col,
               # cutree_rows = n_clusters, # OG was 4
               # cutree_cols = 2,
               gaps_col = c(12,24),
               annotation_colors = my_colour,
               border_color=FALSE,
               cluster_cols = F,
               breaks = my.breaks,
               ## color scheme borrowed from: 
               color = inferno(length(my.breaks) - 1),
               treeheight_row = 0,
               # treeheight_col = 0,
               # remove the color scale or not
               main = paste0("DEGs - ",names(which.degs)[[i]], "\n (n=",nrow(deg.dat), " genes)"),
               ## annotation legend
               annotation_legend = T,
               ## Color scale
               legend = T) %>% 
      print() 
    
    
  #   for (j in 1:n_clusters){
  #   
  #   writeLines(paste0("Which DEGs: ", names(which.degs)[[i]], "\n", "Cluster: ", j))
  #   
  #   # Summary
  #   genes <- my_gene_col %>% rownames_to_column("g") %>% filter(cluster==as.character(j)) %>% pull(g)
  #   writeLines(paste0("n(genes) = ", length(genes),"\n"))
  #   
  #   # define the background geneset for enrichment analysis
  #   bg.genes <- all.DEGs %>% pull(gene_name)
  #   
  #   # Enrichment
  #   overrepresented.terms <-
  #     genes %>% 
  #       check_enrichment(.,
  #                     what = "pfams",
  #                     org = sampleName[[1]],
  #                     bg = bg.genes,
  #                     filter = T,
  #                     expand = T,
  #                     plot = T)
  #   
  #   # writeLines(paste0("\n", "n(overrepresented terms) = ", nrow(overrepresented.terms), "\n"))
  #   # overrepresented.terms %>% print()
  #   
  #   # Stacked zplot
  #   stacked.plot1 <- genes %>% stacked.zplot_tc6(cond = sampleName[[1]]) %>% pluck(1)
  #   stacked.plot2 <- genes %>% stacked.zplot_tc6(cond = sampleName[[2]]) %>% pluck(1)
  #   ggpubr::ggarrange(plotlist=list(stacked.plot1, stacked.plot2),
  #                 nrow = 1, ncol = 2,
  #                 widths = c(1,1), labels = NA) %>% 
  #     print()
  #   
  # }
 
}

7. Enriched functions of these DEGs

# define the background for enrichment analyses
bg.genes <- rownames(emat)

### Gene set of interest 6 ###
gsoi.6 <- list()

## loop begins
for (i in 1:length(which.degs)) {
  
  gsoi.6[[i]] <-
    which.degs[[i]] %>% 
    check_enrichment(.,
                     org = "ophio_cflo",
                     bg = bg.genes,
                     # what = "GOs",
                     what = "pfams",
                     expand = T) %>% 
    # # nothing much to talk about things that have no annotations ---
    filter(annot_desc!="no_desc") %>%
    # # save the results to a file
    arrange(annot_desc) #%>% head(20) %>% print()
    
  # MAKE THE SUPP FILE
  gsoi.6[[i]] <-
    annots[[2]] %>%
    select(gene_name, gene_desc, GOs, pfams, signalP, SSP, TMHMM) %>%
    filter(gene_name %in% which.degs[[i]]) %>%  # change here
    left_join(gsoi.6[[i]][,c(1,3)], by="gene_name") %>%  # change here
    select(gene_name, gene_desc, enriched_annot=annot_desc, everything()) %>%
    # mutate(enriched_annot=as.factor(enriched_annot)) %>%
    arrange(enriched_annot) %>%

    ## add DEG analyses data
      left_join(all.DEGs %>%
                  select(gene_name,
                         abs_log2FC = logFC,
                         inf_v_control, BH_pval = adj.P.Val),
                by=c("gene_name")) %>%
      mutate(abs_log2FC = abs(abs_log2FC)) %>%
      select(ophio_gene=gene_name, everything()) %>%

      ## add rhythmicity data - ophio_cflo - controls
        left_join((rhy.24.sig[[2]] %>%
                    dplyr::select(ophio_gene=ID, control_pval=GammaP) %>%
                    mutate(control_rhy24 = ifelse(control_pval < FDR/100, "yes", "no"))),
                  by=c("ophio_gene")) %>%

      ## add rhythmicity data - ophio_cflo - infected
        left_join(rhy.24.sig[[3]] %>%
                    select(ophio_gene=ID, inf_pval=GammaP) %>%
                    mutate(inf_rhy24 = ifelse(inf_pval < FDR/100, "yes", "no")),
                  by=c("ophio_gene")) %>%

      ## arrange the columns
        select(ophio_gene, gene_desc, inf_v_control, abs_log2FC,
               enriched_annot,
               control_rhy24, inf_rhy24, GOs:TMHMM, everything()) %>%

      ## arrange by enriched_annot
       arrange(inf_v_control, desc(abs_log2FC), enriched_annot)
    
  }
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## loop ends

# ## Save results of not_expressed genes into an excel file
# supp <- list(
#   "UP_DEG_ophio_control_v_inf" = gsoi.6[[1]],
#   "DOWN_DEG_ophio_control_v_inf" = gsoi.6[[2]])
# writexl::write_xlsx(supp,
#                     path = paste0(path_to_repo,
#                                   "/results/00_supplementary_files/06_gsoi_DEG_ophio_control_v_inf.xlsx"))
# # update counter
# counter <- counter+1

Of the different pfam domains enriched in these DEGs, several (MFS_1, Sugar_tr, NAD_binding_8) were also overrepresented in the differentially rhythmic genes between Ocflo and Beauveria. More specifically, the upregulated gene set was enriched for fungal-specific transcription factors (Fungal_trans), oxidoreductase activity (p450), and Zn_clus genes, in addition to several MFS_1 genes. Whereas, the genes downregulated at the halfway mark were enriched in Pyr_redox_2 genes in addition to MFS_1, Sugar_tr, and NAD_binding_8 genes.

Next, we wanted to looked at specific subsets of these DEGs, and their functions, to identify the putative mechanisms that likely allows Ocflo to incubate (grow) inside its host for such extended periods. Of interest were three sets of DEGs, (1) Genes that are significantly up/down-regulated during infection as well as manipulation, and (2) genes that are up/down-regulated only at infection but not at manipulation, and (3) genes that are downregulated at infection but upregulated at manipulation. The first set of genes that show a consistent up/down-regulation at both infection and manipulation likely is involved in general growth and survival of the fungus inside the host. The second set of genes that are differentially expressed only at infection but not at manipulation might be important for maintaining homeostasis inside the host, maybe via parasite-host synchronization of metabolic functions, to ensure that the infected ant host can survive the long incubation period necessary for Ocflo’s life cycle. Whereas, the third set of genes that are downregulated at the halfway mark but upregulated at manipulation likely contains fungal effectors that are necessary to induce successful manipulation of host behavior (death-grip), but harmful to the host otherwise.

8. DEGs of interest

A. Genes up/down-regulated at infection and manipulation

# UP in both
up.in.both <- intersect(inf.up, manip.up)
writeLines("Genes UP regulated at infection and manipulation.")
## Genes UP regulated at infection and manipulation.
up.in.both %>% 
  check_enrichment(.,
                 org = "ophio_cflo",
                 bg = bg.genes,
                 # what = "GOs",
                 what = "pfams",
                 expand = T) %>% 
  # # nothing much to talk about things that have no annotations ---
  # filter(annot_desc!="no_desc") %>%
  # # save the results to a file
  arrange(annot_desc)
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## # A tibble: 22 x 3
## # Groups:   gene_name [22]
##    gene_name   gene_desc                                       annot_desc       
##    <chr>       <chr>                                           <chr>            
##  1 GQ602_0006… fungal specific transcription factor domain-co… Fungal_trans     
##  2 GQ602_0052… fungal specific transcription factor            Fungal_trans     
##  3 GQ602_0069… Casein kinase II, regulatory subunit            Fungal_trans     
##  4 GQ602_0010… Transcription factor, fungi                     Fungal_trans; Zn…
##  5 GQ602_0016… transcriptional activator xlnR                  Fungal_trans; Zn…
##  6 GQ602_0043… binuclear zinc transcription factor             Fungal_trans; Zn…
##  7 GQ602_0007… Tubulin gamma chain                             MFS_1            
##  8 GQ602_0013… related to hexose transporter protein           MFS_1            
##  9 GQ602_0016… Major facilitator superfamily domain, general … MFS_1            
## 10 GQ602_0016… MFS transporter                                 MFS_1            
## # … with 12 more rows
# ## Which genes?
# up.in.both %>% 
#   TC6_annotator() %>%
#   view()


# DOWN in both
down.in.both <- intersect(inf.down, manip.down)
writeLines("Genes DOWN regulated at infection and manipulation.")
## Genes DOWN regulated at infection and manipulation.
down.in.both %>% 
  check_enrichment(.,
                 org = "ophio_cflo",
                 bg = bg.genes,
                 what = "GOs",
                 # what = "pfams",
                 # what = "TMHMM",
                 expand = T) %>% 
  # # nothing much to talk about things that have no annotations ---
  # filter(annot_desc!="no_desc") %>%
  # # save the results to a file
  arrange(annot_desc) %>% print()
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## # A tibble: 39 x 3
## # Groups:   gene_name [39]
##    gene_name   gene_desc                             annot_desc                 
##    <chr>       <chr>                                 <chr>                      
##  1 GQ602_0011… phosphoenolpyruvate carboxykinase     carbohydrate metabolic pro…
##  2 GQ602_0012… Ornithine aminotransferase            cofactor binding           
##  3 GQ602_0025… Pyridine nucleotide-disulfide oxidor… cofactor binding           
##  4 GQ602_0035… acyl-CoA dehydrogenase                cofactor binding           
##  5 GQ602_0036… FAD dependent oxidoreductase          cofactor binding           
##  6 GQ602_0037… aldehyde reductase ii                 cofactor binding           
##  7 GQ602_0040… aspartate aminotransferase            cofactor binding           
##  8 GQ602_0062… Mannitol-1-phosphate 5-dehydrogenase  cofactor binding           
##  9 GQ602_0069… thiamine pyrophosphate enzyme         cofactor binding           
## 10 GQ602_0020… indoleamine 2,3-dioxygenase-like pro… cofactor binding; heme bin…
## # … with 29 more rows
# ## Which genes?
# down.in.both %>% 
#   TC6_annotator() %>%
#   view()

UP in both

contains three copies of PTP, the clock gene CK2, and a putative enterotoxin

DOWN in both

contains a copy of PTP (which has an enterotoxin domain), and two copies of putative enterotoxins.

B. Genes up/down-regulated only at infection but not at manipulation

# UP/DOWN at inf only
deg.at.inf.only <- c(setdiff(inf.up, manip.up), setdiff(inf.down, manip.down))

deg.at.inf.only %>% 
  check_enrichment(.,
                 org = "ophio_cflo",
                 bg = bg.genes,
                 what = "GOs",
                 # what = "pfams",
                 expand = T) %>% 
  # # nothing much to talk about things that have no annotations ---
  filter(annot_desc!="no_desc") %>%
  # # save the results to a file
  arrange(annot_desc)
## [1] "Loading annotation file for Ophiocordyceps camponoti-floridani"
## [1] "Done."
## [1] "Testing for enrichment..."

## # A tibble: 28 x 3
## # Groups:   gene_name [28]
##    gene_name   gene_desc                                      annot_desc        
##    <chr>       <chr>                                          <chr>             
##  1 GQ602_0005… related to aquaporin                           transmembrane tra…
##  2 GQ602_0005… Major facilitator superfamily domain, general… transmembrane tra…
##  3 GQ602_0006… sugar transporter STL1                         transmembrane tra…
##  4 GQ602_0006… NCS1 nucleoside transporter family protein     transmembrane tra…
##  5 GQ602_0009… amino acid permease                            transmembrane tra…
##  6 GQ602_0019… Cation efflux protein                          transmembrane tra…
##  7 GQ602_0022… ATP synthase subunit g                         transmembrane tra…
##  8 GQ602_0023… ABC transporter, transmembrane domain, type 1  transmembrane tra…
##  9 GQ602_0023… Sodium/calcium exchanger membrane region       transmembrane tra…
## 10 GQ602_0030… siderophore iron transporter                   transmembrane tra…
## # … with 18 more rows
# ## Which genes?
# deg.at.inf.only %>%
#   TC6_annotator() %>%
#   view()

Toxins

contains three enterotoxins (two heat-labile enteroxins are upregulated and a putative enterotoxin is downregulated at infection), the putative aflatoxin efflux pump AFLT (downregulated), a protoplast regeneration and killer toxin resistance protein (downregulated), a gliotoxin biosynthesis protein GliK (implicated in assisting tissue penetration)[https://link.springer.com/article/10.1023/B:MYCO.0000038434.55764.16].

Clock genes and modulators

contains the core fungal clock gene frequency and vivid (synonymous to ENVOY)

C. Genes down-regulated at infection but up-regulated at manipulation

no enriched pfams or GOs were found.

# UP/DOWN at inf only
down.inf.up.manip <- intersect(inf.down, manip.up)

# down.inf.up.manip %>% 
#   check_enrichment(.,
#                  org = "ophio_cflo",
#                  bg = bg.genes,
#                  what = "GOs",
#                  # what = "pfams",
#                  expand = T) %>% 
#   # # nothing much to talk about things that have no annotations ---
#   filter(annot_desc!="no_desc") %>%
#   # # save the results to a file
#   arrange(annot_desc) %>% view()

# # Any interesting genes?
# down.inf.up.manip %>% 
#   TC6_annotator() %>% 
#     # filter(control_rhy24=="yes") %>% 
#   view()

9. Export genes of interst

We subset all of our Ocflo genes of interest that we want to map to the gene co-expression network that we will build in part-2 (see 02_build_annotate_circadianGCN_ocflo.Rmd)

Three primary sets of Ocflo genes are of interest to us:

  • 24h rhythmic genes (list = rhy.24.sig[[2]])
  • Genes that are classified as rhythmic in both Ocflo and Okim (list = gsoi.3[[5]])
  • Genes that show differential rhythmicity between Ocflo and Beauveria (list = gsoi.5[[c(4:5)]])
  • Genes that are differentially expressed during infection versus controls (all.DEGs)

Let’s collate all of these gene sets into one list and save it as a Rdata file

# gois.tc6 <- 
#   list(
#     "ocflo_rhy24" = rhy.24.sig[[2]] %>% pull(1) %>% unique(),
#     "ocflo_okim_rhy24" = gsoi.3[[5]] %>% filter(rhy24_ocflo=="yes" & rhy24_okim=="yes") %>% pull(1) %>% unique(),
#     "ocflo_beau_DRGs_rhy24_cluster3" = gsoi.5[[3]] %>% pull(1) %>% unique(),
#     "ocflo_beau_DRGs_rhy24_cluster4" = gsoi.5[[4]] %>% pull(1) %>% unique(),
#     "ocflo_DEGs_up_at_infection" = inf.up,
#     "ocflo_DEGs_down_at_infection" = inf.down,
#     "ocflo_DEGs_up_in_both" = up.in.both,
#     "ocflo_DEGs_down_in_both" = down.in.both,
#     "ocflo_DEGs_at_inf_only" = deg.at.inf.only,
#     "ocflo_DEGs_down_inf_up_manip" = down.inf.up.manip,
#     "ocflo_DEGs_up_at_manip" = manip.up,
#     "ocflo_DEGs_down_at_manip" = manip.down
#   )
# 
# ## save this file
# save(gois.tc6,
#      file = paste0(path_to_repo, "/results/genes_of_interest/goi_list.RData"))